## ----------------------------------------------------------------------------
## Title: 03_descriptives.R
## Author: Elsa Voytas
## Created: May-23-2024
## Last updated: April-13-2025
## ----------------------------------------------------------------------------

library(dplyr)
library(readr)
library(stringi)
library(zoo)
library(rio)

# ------------------------------------------------------------------------------
# Step 1: Producing individual-level reparations dataset
# ------------------------------------------------------------------------------

# read in voting data - this is voting data from before registration became automatic
voting <- import("Input/padron2010.dta") # This file contains private, personal data and is not included in replication materials
Encoding(voting$nombre) <-'latin1' # set encoding
voting$nombre <- stri_trans_general(voting$nombre,"Latin-ASCII")

# de-dupe and rename name variable
voting <- voting %>%
  distinct() %>%
  rename(name = nombre,
         rut = cedula,
         dv = digito)

# read in IPS data
ips <- import("Input/nomina surviving victims.xlsx") # This file contains private, personal data and is not included in replication materials
ips <- ips %>%
  na.omit() #removing empty rows and 2 observations without reparations date

ips$name <- stri_trans_general(ips$name,"Latin-ASCII")

rettig <- import("Input/nomina rettig.xlsx") # This file contains private, personal data and is not included in replication materials
rettig <- rettig %>%
  na.omit() #removing empty rows and observations without reparations date

# Figure from page 11
nrow(ips)/(nrow(rettig) + nrow(ips))
# 0.9169387

# convert date variables to consistent format (https://stackoverflow.com/questions/13764514/how-to-change-multiple-date-formats-in-same-column)
a <- as.Date(ips$date_determined,format="%d-%m-%Y") # Produces NA when format is not this
ips$date_determined <- as.numeric(as.character(ips$date_determined))
b <- as.Date(ips$date_determined,origin="1899-12-30") # Produces NA when format is not this
a[is.na(a)] <- b[!is.na(b)] # Combine formats, preserve order
ips$date_determined <- a # Put back in dataframe

# De-dupe and filter our 4 people who got reps twice
ips <- ips %>%
  distinct() %>%
  arrange(date_determined) %>%
  distinct(name, .keep_all = TRUE) 

# read in IPS data (covariates)
covs1 <- import("Input/Pension anual de Reparacion de la Ley 19.992 Valech, Rettig.xlsx") # This file contains private, personal data and is not included in replication materials
covs2 <- import("Input/Pension anual de Reparacion de la Ley 19.992 Valech II, Rettig.xlsx") # This file contains private, personal data and is not included in replication materials
covs <- bind_rows(covs1, covs2)
covs <- covs %>%
  mutate(Nombre = stri_trans_general(Nombre,"Latin-ASCII"),
         fnacim = as.character(F_Nac)) %>%
  rename(name = Nombre) %>%
  distinct() %>%
  distinct(name, F_Nac, .keep_all = T) %>%# filter duplicates
  mutate(name = gsub("\\s+"," ",name),
         birthyear = substr(fnacim, 1, 4))

voting <- voting %>% # dropping two rows that did not convert correctly due to special characters
  slice(-c(4269226,7060144)) %>% # this is inconsequential; neither received reparations or was a victim
  mutate(birthyear = substr(fnacim, 1, 4))

# Merge IPS reparations data and covariates data by name
first.df <- ips %>%
  left_join(covs, by='name') %>%
  distinct()

# when we have IPS data about name and birthdate, merge on both fields with voting data
wcovs <- first.df %>%
  filter(!is.na(F_Nac)) %>%
  left_join(voting, by=c('name','birthyear')) %>%
  distinct()

# there are some people who appear in IPS list but no covariates
# merge with voting data on name
wocovs <- first.df %>%
  filter(is.na(F_Nac)) %>%
  left_join(voting, by="name") %>%
  distinct() %>%
  mutate(birthyear = birthyear.y) %>%
  select(-c(birthyear.x, birthyear.y))

# bind people w/ and w/o covariates
first.df <- bind_rows(wcovs, wocovs)

# there are a handful of instances where reparations data 
# return more than one match in the voter file. 
# this code identifies duplicated entries by name & IPS birthdate
# -- some of these arise from people sharing names, while others are real duplicates
repeat.names <- first.df %>%
  group_by(name, F_Nac) %>%
  mutate(n=n()) %>%
  filter(n>1) %>%
  filter(n_distinct(finscrip) != 1) %>%
  ungroup()

# I assume that if the duplicated entries share both a name and exact birthdate,
# they are referring to the same person. Filtering to earliest voter registration
# date within name/birthdate.
voting.dups <- first.df %>%
   group_by(name, F_Nac) %>%
  mutate(n=n()) %>%
  filter(n>1) %>%
  filter(n_distinct(finscrip) != 1) %>%
  filter(fnacim.x == fnacim.y) %>%
  filter(finscrip == min(finscrip)) %>%
  dplyr::select(-n) %>%
  ungroup()

# identifying those with names that repeat in the voter file and no
# additional information -- this makes it impossible to identify WHICH
# voter file entry belongs to whom
twice <- repeat.names$name[!repeat.names$name %in% voting.dups$name]

first.df <- first.df %>% anti_join(voting.dups, by = c("name", "F_Nac"))
first.df <- first.df %>%
  distinct() %>%
  filter(!(name %in% twice)) %>%# filtering out those for whom we do not know which voter registration applies
  distinct(name, F_Nac, .keep_all = T) 

first.df <- rbind(first.df, voting.dups)

# creating covariates and date variables
first.df$female <- ifelse(first.df$genero=="F",1,0) # fill in sex with IPS data first
first.df$female <- ifelse(is.na(first.df$genero)&first.df$sexo==0,1,first.df$female) # if IPS data is missing, use voting variable
first.df$female <- ifelse(is.na(first.df$genero)&first.df$sexo==1,0,first.df$female)
first.df <- first.df %>%
  mutate(F_Nac = as.Date(as.character(F_Nac), format="%Y%m%d"))#getting birthdate
first.df <- first.df %>%
  mutate(birthyr = format(as.Date(F_Nac), "%Y"))
first.df$birthyr <- ifelse(is.na(first.df$F_Nac)&!is.na(first.df$birthyear),
                           first.df$birthyear,first.df$birthyr) # using IPS covariate data or birthyear data from voting record if NA
first.df$age <- 2005 - as.numeric(first.df$birthyr) # creating age variable (using 2005, which was when reparations started)

first.df <- first.df %>%
  mutate(StartDate = min(date_determined), # first date when reparations are issued
         age.at.first =StartDate-F_Nac,
         finscrip = as.Date(finscrip, format="%Y%m%d"),
         EndDate = as.Date(max(finscrip,na.rm=T))) # end date is last day that we have record of voter registration

# removing underage ppl
underage <- first.df %>%
  filter(age.at.first < 6574.5) # 13 ppl were underage when they first got reparations - we drop them since they are ineligible to register to vote
first.df <- first.df %>%
  filter(!name %in% underage$name) 

# filtering out duplicate rows, name/birthdate
first.df <- first.df %>%
  distinct()

first.df <- first.df %>%
  mutate(ID = 1:length(first.df$name), # create ID instead of using name
         StartDate = as.yearmon(StartDate, format = "%Y-%m"),
         EndDate = as.yearmon(EndDate, format = "%Y-%m"))

first.df$StartDate <- as.Date(first.df$StartDate)
# merge in urban/rural data for later
urban <- read_csv("Input/urban.csv") # data from https://redatam-ine.ine.cl/redbin/RpWebEngine.exe/Portal?BASE=CENSO_2002&lang=esp
urban <- urban %>%
  mutate(comuna = toupper(stri_trans_general(str = urban$`Unidad territorial`, id = "Latin-ASCII")))
full.df <- left_join(first.df, urban, by="comuna")

# read in province data

comuna <- read_csv("Input/comunas.csv")
comuna$comuna <- stri_trans_general(comuna$comuna,"Latin-ASCII")
comuna$comuna <- toupper(comuna$comuna)
comuna <- comuna %>%
  mutate(comuna = ifelse(comuna == "COIHAIQUE", "COYHAIQUE",
                         ifelse(comuna=="AISEN", "AYSEN",
                                ifelse(comuna == "ISLA  DE PASCUA", "ISLA DE PASCUA", comuna))))
full.df <- full.df %>%
  left_join(comuna)
full.df$urban <- as.numeric(full.df$`2002`)
full.df$urbandummy <- as.numeric(ifelse(full.df$urban>=86.6,1,0))

# ------------------------------------------------------------------------------
## Table 1: Summary statistics for demographic, independent, & dependent variables
# ------------------------------------------------------------------------------
# Producing Total Sample figures

# Age at time of Valech release among total sample
2004-mean(as.numeric(full.df$birthyr),na.rm=T)
2004-min(as.numeric(full.df$birthyr),na.rm=T)
2004-max(as.numeric(full.df$birthyr),na.rm=T)

# Gender breakdown
prop.table(table(full.df$female,exclude=NULL))

# Province breakdown
sort(prop.table(table(full.df$provincia, exclude=NULL)), decreasing = TRUE)[1:3]

# range and mean approval dates
mean(full.df$date_determined)
min(full.df$date_determined)
max(full.df$date_determined)

# range and mean registration dates
mean(full.df$finscrip, na.rm=T)
min(full.df$finscrip, na.rm = T)
max(full.df$finscrip, na.rm = T)

# N
nrow(full.df)

# Producing Narrowed Sample figures
# Age at time of Valech release among total sample
narrowed.df1 <- full.df[which(is.na(full.df$finscrip)|full.df$finscrip>min(full.df$date_determined)),]
narrowed.df <- narrowed.df1[which(narrowed.df1$date_determined<"2010-09-01"),]

# age
2004-mean(as.numeric(narrowed.df$birthyr), na.rm = T)
2004-min(as.numeric(narrowed.df$birthyr),na.rm=T)
2004-max(as.numeric(narrowed.df$birthyr),na.rm=T)

# gender
prop.table(table(narrowed.df$female, exclude=NULL))

# provinces
sort(prop.table(table(narrowed.df$provincia, exclude=NULL)), decreasing = TRUE)[1:3]

# range and mean approval dates
mean(narrowed.df$date_determined)
min(narrowed.df$date_determined)
max(narrowed.df$date_determined)

# range and mean registration dates
mean(narrowed.df$finscrip, na.rm=T)
min(narrowed.df$finscrip, na.rm = T)
max(narrowed.df$finscrip, na.rm = T)

# N
nrow(narrowed.df)

print("Table 1 complete")

# ----------------------------
# Main analysis data prep
# ----------------------------

# setting parameters of final sample
narrowed.df$age.bucket <- ntile(narrowed.df$birthyr, 3)

## Merge in comuna-income level
# read in CASEN 2003 from https://pacha.dev/casen/
casen03 <- readRDS("Input/2003.rds")

income <- casen03 %>%
  group_by(comu) %>%
  dplyr::summarize(avginctot = mean(ytothaj, na.rm=T)) %>%
  ungroup()
income$comu <- as.character(income$comu)

#comuna$comu <- sub("^0+", "", comuna$comu)
casenlink <- read_csv("Input/casen link.csv")
casenlink$comu <- as.character(casenlink$comu)

income <- income %>%
  right_join(casenlink)

income$comuna <- toupper(income$comuna)
income$comuna <- stri_trans_general(income$comuna,"Latin-ASCII")

nat.terciles <- quantile(income$avginctot, probs = c(1/3, 2/3), na.rm = TRUE)
income$nat.tercile <- cut(
  income$avginctot,
  breaks = c(-Inf, nat.terciles[1], nat.terciles[2], Inf),
  labels = c("Low", "Middle", "High"),
  right = TRUE)

income$nat.half <- ifelse(income$avginctot <= 
                            median(income$avginctot, na.rm=T), "Low", "High")



income <- income %>%
  mutate(comuna = ifelse(comuna == "CALERA", "LA CALERA",
                         ifelse(comuna=="LLAY LLAY", "LLAILLAY",
                                ifelse(comuna == "ISLA  DE PASCUA", "ISLA DE PASCUA", comuna))))

first.df <- narrowed.df %>%
  left_join(income, by = "comuna")

buckets <- quantile(first.df$avginctot, probs = c(1/3, 2/3), na.rm = TRUE)

first.df$bucket <- cut(
  first.df$avginctot,
  breaks = c(-Inf, buckets[1], buckets[2], Inf),
  labels = c("1", "2", "3"),
  right = TRUE)

panel <- first.df %>%
  group_by(name, F_Nac) %>%
  mutate(time = list(seq.Date(as.Date(StartDate), as.Date(EndDate), by = "month"))) %>%
  tidyr::unnest() %>%
  ungroup()

# creating treatment and outcome variables
panel <- panel %>%
  mutate(vote = ifelse(finscrip>time,0,1)) # if voter registration date is after row time, code 0, otw 1
panel$vote[is.na(panel$vote)] <- 0 # if NA, that means registration date is NA (unregistred), so code as 0
panel <- panel %>%
  mutate(rep.date = date_determined,
    reparation = ifelse(rep.date > time, 0,1)) # if reparations date is after row time, code 0, otw 1- should this start one period before?

# create time ID (numeric variable for use in packages)

times <- data.frame(time = sort(unique(panel$time)),
                    time.id = 1:length(unique(panel$time)))
panel.df <- merge(panel, times, by = "time") %>%
  distinct(name, time.id, .keep_all = T) %>%
  group_by(name) %>%
  mutate(firstrep = time.id[which.max(reparation)]) %>%
  ungroup()

# save individual-level dataframe
write.csv(panel.df, "Input/panel.df.csv")
write.csv(full.df, "Input/full.df.csv")
write.csv(first.df, "Input/sample.df.csv")
write.csv(voting, "Input/voting.csv")

